package org.neo4j.rdf.fulltext;

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

import javax.transaction.SystemException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.neo4j.graphdb.GraphDatabaseService;
import org.neo4j.graphdb.Node;
import org.neo4j.graphdb.NotFoundException;
import org.neo4j.helpers.Predicate;
import org.neo4j.helpers.collection.FilteringIterator;
import org.neo4j.helpers.collection.IteratorUtil;
import org.neo4j.helpers.collection.PrefetchingIterator;
import org.neo4j.rdf.fulltext.PersistentQueue.Entry;
import org.neo4j.rdf.fulltext.VerificationHook.Status;
import org.neo4j.rdf.model.Uri;
import org.neo4j.util.GraphDatabaseUtil;

/**
 * A {@link FulltextIndex} using Lucene.
 * The query format (see the search method) is a plain Lucene query, but with
 * the addition that an AND operator is squeezed in between every word, making
 * it an AND search by default instead of OR.
 *
 * Calls to the index and removeIndex methods are gathered in a temporary log,
 * and a call to the end method hands those operations over to a queue to be
 * indexed in the near future. The "txId", i.e. transaction id, is really just
 * the javax.transaction.Transaction object's hashCode() value at the moment.
 * That is what you will have to pass to the end( int txId, boolean commit )
 * method if you choose not to use the end( boolean commit ) method, which
 * figures the id out itself, provided that you are in a transaction at the
 * time of the call.
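 *
 * A minimal usage sketch (illustrative, not taken from this codebase; it
 * assumes an open transaction, an existing Node named node and a Uri named
 * titlePredicate):
 * <pre>
 * FulltextIndex index = new SimpleFulltextIndex( graphDb, new File( "fulltext-index" ) );
 * index.index( node, titlePredicate, "Hello World" );
 * index.end( true ); // flush this transaction's operations to the indexing queue
 * // ...later, once the background indexing thread has caught up:
 * for ( RawQueryResult hit : index.searchWithSnippets( "hello world", 10 ) )
 * {
 *     // each hit carries the matching node, its score and a highlighted snippet
 * }
 * index.shutDown();
 * </pre>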
 */
public class SimpleFulltextIndex implements FulltextIndex
{
    /** The literal node id */
    private static final String KEY_ID = "id";
    private static final String KEY_INDEX = "index";
    private static final String KEY_PREDICATE = "predicate";
    private static final String KEY_INDEX_SOURCE = "index_source";
    private static final String SNIPPET_DELIMITER = "...";
    private static final int BATCH_SIZE = 100;

    private LiteralReader literalReader = new SimpleLiteralReader();
    private String directoryPath;
    private String queuePath;
    private Directory directory;
    private Analyzer analyzer = new Analyzer()
    {
        @Override
        public TokenStream tokenStream( String fieldName, Reader reader )
        {
            return new LowerCaseFilter( new WhitespaceTokenizer( reader ) );
        }
    };
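
    // Tokenization sketch for the analyzer above (illustrative): the
    // WhitespaceTokenizer splits on whitespace only and the LowerCaseFilter
    // folds case, so "Graph Databases ROCK!" yields the terms
    // [ "graph", "databases", "rock!" ] -- punctuation stays attached to its
    // token, both at indexing and at query parsing time.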

    private GraphDatabaseService graphDb;
    private GraphDatabaseUtil graphDbUtil;
    private Map<Integer, Collection<Object[]>> toIndex = Collections.synchronizedMap(
        new HashMap<Integer, Collection<Object[]>>() );
    private PersistentQueue indexingQueue;
    private IndexingThread indexingThread;
    private Formatter highlightFormatter;
    private Set<String> predicateFilter;
    private IndexSearcher indexSearcher;

    public SimpleFulltextIndex( GraphDatabaseService graphDb, File storagePath )
    {
        this( graphDb, storagePath, null );
    }

    public SimpleFulltextIndex( GraphDatabaseService graphDb, File storagePath,
        Collection<String> predicateFilter )
    {
        this( graphDb, storagePath, null, null, predicateFilter );
    }

    public SimpleFulltextIndex( GraphDatabaseService graphDb, File storagePath,
        String highlightPreTag, String highlightPostTag,
        Collection<String> predicateFilter )
    {
        if ( highlightPreTag == null || highlightPostTag == null )
        {
            this.highlightFormatter = new SimpleHTMLFormatter();
        }
        else
        {
            this.highlightFormatter = new SimpleHTMLFormatter(
                highlightPreTag, highlightPostTag );
        }
        this.predicateFilter = predicateFilter == null ?
            null : new HashSet<String>( predicateFilter );
        this.directoryPath = storagePath.getAbsolutePath();
        this.queuePath = this.directoryPath + "-queue";
        this.graphDb = graphDb;
        this.graphDbUtil = new GraphDatabaseUtil( graphDb );
        startUpDirectoryAndThread();
    }

    private void startUpDirectoryAndThread()
    {
        this.indexingQueue = new PersistentQueue( new File( queuePath ) );
        this.indexingQueue.setAutoCompleteEntries( false );
        try
        {
            cleanWriteLocks( new File( directoryPath ) );
            createLuceneDirectory();
        }
        catch ( IOException e )
        {
            throw new RuntimeException( e );
        }
        this.indexingThread = new IndexingThread();
        this.indexingThread.start();
    }

    // Recursively removes stale Lucene "write.lock" files left behind by an
    // unclean shutdown, so that a new IndexWriter can be opened.
    private void cleanWriteLocks( File path )
    {
        if ( !path.isDirectory() )
        {
            return;
        }
        for ( File file : path.listFiles() )
        {
            if ( file.isDirectory() )
            {
                cleanWriteLocks( file );
            }
            else if ( file.getName().equals( "write.lock" ) )
            {
                boolean success = file.delete();
                assert success;
            }
        }
    }

    public void clear()
    {
        internalShutDown();
        delete();
        startUpDirectoryAndThread();
    }

    private void createLuceneDirectory() throws IOException
    {
        if ( !IndexReader.indexExists( directoryPath ) )
        {
            new File( directoryPath ).mkdirs();
            IndexWriter writer = new IndexWriter( directoryPath, analyzer,
                true, MaxFieldLength.UNLIMITED );
            writer.close();
        }
        directory = FSDirectory.getDirectory( directoryPath );
        if ( IndexWriter.isLocked( directory ) )
        {
            IndexWriter.unlock( directory );
        }
    }

    private Directory getDir() throws IOException
    {
        return this.directory;
    }

    private IndexWriter getWriter( boolean create ) throws IOException
    {
        return new IndexWriter( getDir(), analyzer, create,
            MaxFieldLength.UNLIMITED );
    }

    public void index( Node node, Uri predicate, Object literal )
    {
        index( node.getId(), predicate.getUriAsString(), literal );
    }

    private void index( long nodeId, String predicate, Object literal )
    {
        enqueueCommand( true, nodeId, predicate, literal );
    }

    // Buffers an index/remove command for the current transaction, keyed by
    // the transaction's hashCode(). Each command is a
    // [ trueForIndex, nodeId, predicate, literal ] tuple.
    private void enqueueCommand( boolean trueForIndex, long nodeId,
        String predicate, Object literal )
    {
        if ( predicateFilter != null && !predicateFilter.contains( predicate ) )
        {
            return;
        }
        try
        {
            int key = graphDbUtil.getTransactionManager().getTransaction().hashCode();
            Collection<Object[]> commands = toIndex.get( key );
            if ( commands == null )
            {
                commands = new ArrayList<Object[]>();
                toIndex.put( key, commands );
            }
            commands.add( new Object[] { trueForIndex, nodeId, predicate, literal } );
        }
        catch ( SystemException e )
        {
            throw new RuntimeException( e );
        }
    }

    protected void safeClose( Object object )
    {
        try
        {
            if ( object != null )
            {
                if ( object instanceof IndexWriter )
                {
                    ( ( IndexWriter ) object ).close();
                }
                else if ( object instanceof IndexReader )
                {
                    ( ( IndexReader ) object ).close();
                }
                else if ( object instanceof IndexSearcher )
                {
                    ( ( IndexSearcher ) object ).close();
                }
                else
                {
                    throw new RuntimeException( object.getClass().getName() );
                }
            }
        }
        catch ( IOException e )
        {
            e.printStackTrace();
        }
    }
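
    // Document layout sketch (illustrative values): indexing the literal
    // "Hello World" for predicate http://example.org/title on node 42 stores
    // one Lucene document:
    //   id           = "42"                        (not analyzed)
    //   index        = "Hello World"               (analyzed; queries match here)
    //   predicate    = "http://example.org/title"  (not analyzed)
    //   index_source = "Hello World"               (not analyzed; exact match for deletes)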

    private void doIndex( IndexWriter writer, long nodeId, String predicate,
        Object literal )
    {
        try
        {
            Document doc = new Document();
            doc.add( new Field( KEY_ID, String.valueOf( nodeId ),
                Store.YES, Index.NOT_ANALYZED ) );
            doc.add( new Field( KEY_INDEX, getLiteralReader().read( literal ),
                Store.YES, Index.ANALYZED ) );
            doc.add( new Field( KEY_PREDICATE, predicate,
                Store.YES, Index.NOT_ANALYZED ) );
            doc.add( new Field( KEY_INDEX_SOURCE, literal.toString(),
                Store.YES, Index.NOT_ANALYZED ) );
            writer.addDocument( doc );
        }
        catch ( IOException e )
        {
            throw new RuntimeException( e );
        }
    }

    public void removeIndex( Node node, Uri predicate, Object literal )
    {
        removeIndex( node.getId(), predicate.getUriAsString(), literal );
    }

    private void removeIndex( long nodeId, String predicate, Object literal )
    {
        enqueueCommand( false, nodeId, predicate, literal );
    }

    private void doRemoveIndex( IndexWriter writer, long nodeId,
        String predicate, Object literal )
    {
        try
        {
            BooleanQuery deletionQuery = new BooleanQuery();
            deletionQuery.add( new TermQuery( new Term( KEY_ID,
                String.valueOf( nodeId ) ) ), Occur.MUST );
            deletionQuery.add( new TermQuery( new Term( KEY_PREDICATE,
                predicate ) ), Occur.MUST );
            deletionQuery.add( new TermQuery( new Term( KEY_INDEX_SOURCE,
                literal.toString() ) ), Occur.MUST );
            writer.deleteDocuments( deletionQuery );
        }
        catch ( IOException e )
        {
            throw new RuntimeException( e );
        }
    }

    private synchronized IndexSearcher getSearcher() throws IOException
    {
        if ( this.indexSearcher == null )
        {
            this.indexSearcher = new IndexSearcher( getDir() );
        }
        else
        {
            IndexReader reopened = this.indexSearcher.getIndexReader().reopen();
            // reopen() returns the same reader instance when the index is
            // unchanged, so only wrap a new searcher when it actually differs.
            if ( reopened != this.indexSearcher.getIndexReader() )
            {
                this.indexSearcher = new IndexSearcher( reopened );
            }
        }
        return this.indexSearcher;
    }

    private void leaveSearcher( IndexSearcher searcher )
    {
        // Intentionally a no-op: the searcher is shared and kept open,
        // see getSearcher().
    }

    public Iterable<RawQueryResult> search( String query )
    {
        return searchWithSnippets( query, 0 );
    }

    public Iterable<RawQueryResult> searchWithSnippets( String query,
        int snippetCountLimit )
    {
        IndexSearcher searcher = null;
        try
        {
            searcher = getSearcher();
            Query q = new QueryParser( KEY_INDEX, analyzer ).parse( query );
            Hits hits = searcher.search( q, Sort.RELEVANCE );
            Highlighter highlighter = null;
            if ( snippetCountLimit > 0 )
            {
                highlighter = new Highlighter( highlightFormatter,
                    new QueryScorer( q ) );
            }
            Iterator<RawQueryResult> resultIterator =
                new ResultIterator( hits, snippetCountLimit, highlighter );
            return IteratorUtil.asIterable( resultIterator );
        }
        catch ( IOException e )
        {
            throw new RuntimeException( e );
        }
        catch ( ParseException e )
        {
            throw new RuntimeException( e );
        }
        finally
        {
            leaveSearcher( searcher );
        }
    }
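
    // Query sketch (illustrative): the query string is parsed as plain
    // Lucene syntax against the "index" field, e.g.
    //   searchWithSnippets( "graph AND database", 3 )
    //   searchWithSnippets( "\"hello world\"", 3 )   // phrase query
    // Snippets are generated only for the first snippetCountLimit hits;
    // later hits get a null snippet.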

    private static Predicate<RawQueryResult> OK_RESULT =
        new Predicate<RawQueryResult>()
    {
        public boolean accept( RawQueryResult result )
        {
            return result != null && result != SPECIAL_FILTERING_INSTANCE;
        }
    };

    private class ResultIterator extends FilteringIterator<RawQueryResult>
    {
        ResultIterator( Hits hits, int snippetCountLimit,
            Highlighter highlighter )
        {
            super( new RawResultIterator( hits, snippetCountLimit,
                highlighter ), OK_RESULT );
        }
    }

    // Sentinel returned for hits that should be silently filtered out of the
    // result stream (duplicates and nodes missing from the graph).
    private static final RawQueryResult SPECIAL_FILTERING_INSTANCE =
        new RawQueryResult( null, 0, null );

    private class RawResultIterator extends PrefetchingIterator<RawQueryResult>
    {
        private Hits hits;
        private int hitsLength;
        private int snippetCountLimit;
        private Highlighter highlighter;

        private int counter = 0;
        private Set<Long> ids = new HashSet<Long>();
        private long getIdTime = 0;
        private long getSnippetTime = 0;
        private long getNodeTime = 0;

        RawResultIterator( Hits hits, int snippetCountLimit,
            Highlighter highlighter )
        {
            this.hits = hits;
            this.hitsLength = hits.length();
            this.snippetCountLimit = snippetCountLimit;
            this.highlighter = highlighter;
        }

        @Override
        protected RawQueryResult fetchNextOrNull()
        {
            int docNum = counter;
            if ( counter >= hitsLength )
            {
                return null;
            }
            counter++;
            try
            {
                long t = System.currentTimeMillis();
                Document doc = hits.doc( docNum );
                long id = Long.parseLong( doc.get( KEY_ID ) );
                getIdTime += ( System.currentTimeMillis() - t );
                if ( !ids.add( id ) )
                {
                    // It's a duplicate here, probably after a crash or
                    // something
                    removeDuplicate( doc );
                    return SPECIAL_FILTERING_INSTANCE;
                }
                float score = hits.score( docNum );
                String snippet = null;
                t = System.currentTimeMillis();
                if ( docNum < snippetCountLimit )
                {
                    snippet = generateSnippet( doc, highlighter );
                }
                getSnippetTime += ( System.currentTimeMillis() - t );
                try
                {
                    t = System.currentTimeMillis();
                    Node node = graphDb.getNodeById( id );
                    getNodeTime += ( System.currentTimeMillis() - t );
                    return new RawQueryResult( node, score, snippet );
                }
                catch ( NotFoundException e )
                {
                    // Ok, probably index lagging a bit behind, that's all.
                    // This also effectively hides many bugs, which is a
                    // BAAD thing.
                    return SPECIAL_FILTERING_INSTANCE;
                }
            }
            catch ( IOException e )
            {
                throw new RuntimeException( e );
            }
        }
    }

    // Re-enqueues a duplicated document as a remove followed by an index, so
    // the background thread collapses it back to a single document.
    private void removeDuplicate( Document doc )
    {
        long nodeId = Long.parseLong( doc.get( KEY_ID ) );
        String predicate = doc.get( KEY_PREDICATE );
        String literal = doc.get( KEY_INDEX_SOURCE );
        removeIndex( nodeId, predicate, literal );
        index( nodeId, predicate, literal );
    }

    private String generateSnippet( Document doc, Highlighter highlighter )
    {
        StringBuffer snippet = new StringBuffer();
        for ( Field field : doc.getFields( KEY_INDEX ) )
        {
            String text = field.stringValue();
            TokenStream tokenStream = analyzer.tokenStream( KEY_INDEX,
                new StringReader( text ) );
            try
            {
                String fragment = highlighter.getBestFragments( tokenStream,
                    text, 2, SNIPPET_DELIMITER );
                if ( snippet.length() > 0 )
                {
                    snippet.append( SNIPPET_DELIMITER );
                }
                snippet.append( fragment );
            }
            catch ( IOException e )
            {
                // TODO
                continue;
            }
            catch ( InvalidTokenOffsetsException e )
            {
                // TODO
                continue;
            }
        }
        return snippet.toString();
    }
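
    // Verification flow (summary of the method below): walk either every
    // live document in the index or only the hits of the given query, ask
    // the hook to check each ( nodeId, predicate, literal ) triple against
    // the graph, tally the returned Status values, and report true only if
    // every inspected document came back as Status.OK.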

    public boolean verify( VerificationHook hook, String queryOrNullForAll )
    {
        IndexSearcher searcher = null;
        try
        {
            searcher = new IndexSearcher( getDir() );
            Map<Status, MutableInteger> counts =
                new HashMap<Status, MutableInteger>();
            int maxDoc = 0;
            final IndexReader reader = searcher.getIndexReader();
            Iterator<Integer> hitsIterator = null;
            if ( queryOrNullForAll == null )
            {
                maxDoc = reader.maxDoc();
                hitsIterator = new PrefetchingIterator<Integer>()
                {
                    private int limit = reader.maxDoc();
                    private int counter;

                    @Override
                    protected Integer fetchNextOrNull()
                    {
                        int c = counter++;
                        return c < limit ? c : null;
                    }
                };
            }
            else
            {
                Query q = new QueryParser( KEY_INDEX, analyzer ).parse(
                    queryOrNullForAll );
                final Hits hits = searcher.search( q, Sort.RELEVANCE );
                maxDoc = hits.length();
                hitsIterator = new PrefetchingIterator<Integer>()
                {
                    private int counter;

                    @Override
                    protected Integer fetchNextOrNull()
                    {
                        try
                        {
                            int c = counter++;
                            return c < hits.length() ? hits.id( c ) : null;
                        }
                        catch ( IOException e )
                        {
                            throw new RuntimeException( e );
                        }
                    }
                };
            }

            hook.verificationStarting( maxDoc );
            while ( hitsIterator.hasNext() )
            {
                int docId = hitsIterator.next();
                if ( reader.isDeleted( docId ) )
                {
                    hook.oneWasSkipped();
                    continue;
                }
                Document doc = reader.document( docId );
                long nodeId = Long.parseLong( doc.get( KEY_ID ) );
                Status status = hook.verify( nodeId, doc.get( KEY_PREDICATE ),
                    doc.get( KEY_INDEX_SOURCE ) );
                MutableInteger count = counts.get( status );
                if ( count == null )
                {
                    count = new MutableInteger();
                    counts.put( status, count );
                }
                count.value++;
            }

            Map<Status, Integer> resultCounts = new HashMap<Status, Integer>();
            int errors = 0;
            for ( Map.Entry<Status, MutableInteger> count : counts.entrySet() )
            {
                resultCounts.put( count.getKey(), count.getValue().value );
                errors += ( count.getKey() == Status.OK ?
                    0 : count.getValue().value );
            }
            hook.verificationCompleted( resultCounts );
            return errors == 0;
        }
        catch ( ParseException e )
        {
            throw new RuntimeException( e );
        }
        catch ( IOException e )
        {
            throw new RuntimeException( e );
        }
        finally
        {
            safeClose( searcher );
        }
    }

    private static class MutableInteger
    {
        private int value;
    }

    public LiteralReader getLiteralReader()
    {
        return this.literalReader;
    }

    public void setLiteralReader( LiteralReader reader )
    {
        this.literalReader = reader;
    }

    public void end( boolean commit )
    {
        try
        {
            end( graphDbUtil.getTransactionManager().getTransaction().hashCode(),
                commit );
        }
        catch ( SystemException e )
        {
            throw new RuntimeException( e );
        }
    }

    public void end( int txId, boolean commit )
    {
        Collection<Object[]> commands = toIndex.remove( txId );
        if ( commands == null || !commit )
        {
            return;
        }
        for ( Object[] command : commands )
        {
            this.indexingQueue.add( command );
            this.indexingThread.hasItems = true;
        }
    }

    public boolean queueIsEmpty()
    {
        return !this.indexingThread.hasItems;
    }

    public void shutDown()
    {
//        TemporaryLogger.getLogger().info( getClass().getName() +
//            " shutDown called", new Exception() );
        internalShutDown();
    }

    private void internalShutDown()
    {
        indexingThread.halt();
        try
        {
            indexingThread.join();
        }
        catch ( InterruptedException e )
        {
            e.printStackTrace();
        }
        indexingQueue.close();
        try
        {
            directory.close();
        }
        catch ( IOException e )
        {
            throw new RuntimeException( e );
        }
        safeClose( this.indexSearcher );
        this.indexSearcher = null;
    }
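
    // Background indexer (summary of the thread below): drains the persistent
    // queue, applies each queued command through a shared IndexWriter, and
    // marks entries as completed only after the writer has been flushed, so
    // that work not yet flushed at a crash is replayed on the next start
    // rather than lost.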

    private class IndexingThread extends Thread
    {
        // Both flags are read and written from different threads, hence
        // volatile for visibility.
        private volatile boolean halted;
        private volatile boolean hasItems;
        private IndexWriter writer;
        private Collection<Entry> entriesToComplete = new ArrayList<Entry>();

        private void halt()
        {
            this.halted = true;
        }

        @Override
        public void run()
        {
            while ( !halted )
            {
                try
                {
                    hasItems = indexingQueue.hasNext();
                    while ( !halted && hasItems )
                    {
                        Entry entry = indexingQueue.next();
                        Object[] data = entry.data();
                        ensureWriters();
                        if ( ( Boolean ) data[ 0 ] )
                        {
                            doIndex( writer, ( Long ) data[ 1 ],
                                ( String ) data[ 2 ], data[ 3 ] );
                        }
                        else
                        {
                            doRemoveIndex( writer, ( Long ) data[ 1 ],
                                ( String ) data[ 2 ], data[ 3 ] );
                        }
                        entriesToComplete.add( entry );
                        if ( entriesToComplete.size() >= BATCH_SIZE ||
                            !indexingQueue.hasNext() )
                        {
                            flushEntries();
                        }
                        hasItems = indexingQueue.hasNext();
                    }

                    // This is so that it flushes if the indexer gets halted.
                    flushEntries();
                    try
                    {
                        long time = System.currentTimeMillis();
                        while ( !halted &&
                            System.currentTimeMillis() - time < 100 )
                        {
                            hasItems = indexingQueue.hasNext();
                            Thread.sleep( 20 );
                        }
                    }
                    catch ( InterruptedException e )
                    {
                        Thread.interrupted();
                    }
                }
                catch ( Throwable t )
                {
                    t.printStackTrace();
                }
            }
        }

        private void ensureWriters() throws Exception
        {
            if ( writer == null )
            {
                writer = getWriter( false );
                writer.setMaxBufferedDocs( BATCH_SIZE * 2 );
                writer.setMaxBufferedDeleteTerms( BATCH_SIZE * 2 );
            }
        }

        private void flushEntries()
        {
            if ( writer == null )
            {
                return;
            }
            // Closing the writer flushes everything to disk; only then are
            // the queue entries safe to mark as completed.
            safeClose( writer );
            writer = null;
//            try
//            {
//                writer.commit();
//            }
//            catch ( IOException e )
//            {
//                TemporaryLogger.getLogger().info(
//                    "Couldn't commit fulltext index writer ", e );
//                safeClose( writer );
//                writer = null;
//            }
            indexingQueue.markAsCompleted( entriesToComplete.toArray(
                new Entry[ entriesToComplete.size() ] ) );
            entriesToComplete.clear();
        }
    }

    private void delete()
    {
        deleteDir( new File( directoryPath ) );
        new File( queuePath ).delete();
    }

    protected void deleteDir( File dir )
    {
        if ( !dir.exists() )
        {
            return;
        }
        for ( File child : dir.listFiles() )
        {
            if ( child.isFile() )
            {
                child.delete();
            }
            else
            {
                deleteDir( child );
            }
        }
        dir.delete();
    }
}